In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from os.path import expanduser
In [2]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer # combines counting and normalizing
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_samples, silhouette_score
In [3]:
%matplotlib inline
#%qtconsole
In [4]:
import os
data_folder = 'corpus_haiku'
# Note the use of read(): readlines() would give a list of lines, but TfidfVectorizer expects one string per document
documents = [open(os.path.join(data_folder, filename)).read()
             for filename in os.listdir(data_folder)]
By choosing a relatively low max_df for the vectorizer, we remove most of the frequently occurring function words that we used for authorship determination, so the TF-IDF algorithm can identify the important content words.
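As a quick, illustrative aside (not part of the original analysis), the toy snippet below shows how max_df and stop_words prune the vocabulary; the documents here are made up, and the 0.4 threshold simply mirrors the tfidf_kwargs used below.

# Illustration only: how max_df and stop_words shrink a TF-IDF vocabulary.
# The toy documents are hypothetical, not drawn from the haiku corpus.
from sklearn.feature_extraction.text import TfidfVectorizer

toy_docs = ["the frost settles on the quiet pond",
            "the heron waits by the quiet pond",
            "the summer moon over the river"]

plain = TfidfVectorizer().fit(toy_docs)
print(sorted(plain.vocabulary_))      # still contains 'the', 'on', 'by', ...

pruned = TfidfVectorizer(max_df=0.4, stop_words='english').fit(toy_docs)
print(sorted(pruned.vocabulary_))     # stop words and terms in >40% of the documents are gone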
In [5]:
import matplotlib.cm as cm
# ========================= PARAMETERS =========================
kmeans_kwargs = dict(copy_x = False,      # modify the data in place instead of copying it
                     n_jobs = -1          # use all available cores
                     )
tfidf_kwargs = dict(max_df = 0.4,         # select only words that appear in
                                          # no more than 40% of the documents
                    stop_words = 'english'
                    )
range_n_clusters = range(2,20+1) # K values
#metric = 'euclidean'
#metric = 'cityblock'
metric = 'cosine'
#metric = 'l1'
#metric = 'l2'
#metric = 'manhattan'
verbose = False # print out lovely graphs and summary?
# ========================= PARAMETERS =========================
summary = [] # list of results
for n_clusters in range_n_clusters:
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    vectorizer = TfidfVectorizer(**tfidf_kwargs)
    clusterer = KMeans(n_clusters=n_clusters, random_state=10, **kmeans_kwargs)
    pipeline = Pipeline([('tfidf', vectorizer),
                         ('kmeans', clusterer)])
    pipeline.fit(documents)
    cluster_labels = pipeline.predict(documents)
    X = pipeline.transform(documents)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels, metric=metric)
    silhouette_avg_pos = np.array(sample_silhouette_values)[np.array(sample_silhouette_values)>0].mean()
    silhouette_avg_neg = np.array(sample_silhouette_values)[np.array(sample_silhouette_values)<0].mean()
    if np.isnan(silhouette_avg_pos): silhouette_avg_pos = 0
    if np.isnan(silhouette_avg_neg): silhouette_avg_neg = 0

    if verbose: print("For n_clusters = {} the average silhouette_score is : {:.4f}; pos:{:.4f}, neg:{:.4f}".\
                      format(n_clusters, silhouette_avg, silhouette_avg_pos, silhouette_avg_neg))
    summary.append((n_clusters, silhouette_avg, silhouette_avg_pos, silhouette_avg_neg))

    if verbose:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot
        # ======================================
        # The silhouette coefficient can range from -1 to 1
        ax1.set_xlim([-1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for the average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        if silhouette_avg_pos>0: ax1.axvline(x=silhouette_avg_pos, color="blue", linestyle="--")
        if silhouette_avg_neg<0: ax1.axvline(x=silhouette_avg_neg, color="blue", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        # ===========================================
        colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors)

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1],
                    marker='o', c="white", alpha=1, s=200)
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d (metric=%s)" % (n_clusters, metric)),
                     fontsize=14, fontweight='bold')
        plt.show()
x_vals = [x[0] for x in summary]
y_avg = [x[1] for x in summary]
y_pos = [x[2] for x in summary]
y_neg = [x[3] for x in summary]
if verbose:
    plt.plot(x_vals, y_avg, label='avg')
    plt.xlim([0, len(x_vals) + 2])
    plt.axvline(x=np.argmax(y_avg) + 2, color="red", linestyle="--")  # +2 maps list index back to K (K starts at 2)
    plt.title('Silhouette Scores by KMeans K value')
    plt.xlabel('KMeans K value')
    plt.ylabel('Silhouette Score')
    plt.plot(x_vals, y_pos, label='pos', linestyle="--")
    # plt.plot(x_vals, y_neg)
    plt.legend(loc='best')
    plt.show()
print("The silhouette scores in descending order for metric = {}".format(metric))
import operator
ordered_summary = sorted(summary, key = operator.itemgetter(1), reverse=True)
for cnum, avg, _, _ in ordered_summary:
    print cnum, "\t", avg
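For readers new to silhouette analysis, here is a minimal, self-contained sketch on synthetic 2-D blobs (not the haiku TF-IDF matrix) showing how the average silhouette score is used to compare candidate K values; the blob parameters are arbitrary.

# Illustration only: choosing K by silhouette score on synthetic blobs.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X_toy, _ = make_blobs(n_samples=300, centers=4, random_state=10)
for k in range(2, 7):
    toy_labels = KMeans(n_clusters=k, random_state=10).fit_predict(X_toy)
    print("k={}  average silhouette={:.3f}".format(k, silhouette_score(X_toy, toy_labels)))
# The K with the highest average score is the best-separated clustering; the loop
# above applies the same idea, but with the cosine metric on TF-IDF vectors.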
In [6]:
# K values whose average silhouette score is above the 60th percentile; +2 maps list index back to K
top_four = np.where(np.array(y_avg) > np.percentile(y_avg, 60))[0] + 2
for n_clusters in top_four:
    pipe = Pipeline([('vec', TfidfVectorizer(**tfidf_kwargs)),
                     ('kmeans', KMeans(n_clusters=n_clusters, **kmeans_kwargs))])
    pipe.fit(documents)
    labels = pipe.predict(documents)

    # how many documents in each cluster?
    # ===================================
    from collections import Counter
    c = Counter(labels)
    plt.bar(c.keys(), c.values())
    plt.title('Distribution of Philadelphia Reflections haiku clusters')
    plt.xlabel('Cluster Number')
    plt.ylabel('Frequency of Haiku in each Cluster')
    for k, v in c.iteritems():
        plt.text(k + 0.25, v + 5, v)
    plt.show()

    # get the term list extracted by TfidfVectorizer
    # ===============================================
    terms = pipe.named_steps['vec'].get_feature_names()

    # find the five most-important terms in each cluster
    # ==================================================
    cluster_avg5_scores = []
    for cluster_number in range(n_clusters):
        print("\nFor cluster {}, {:,} documents".format(cluster_number, c[cluster_number]))
        centroid = pipe.named_steps['kmeans'].cluster_centers_[cluster_number]
        most_important = centroid.argsort()
        total_score = 0
        for i in range(5):
            term_index = most_important[-(i+1)]
            total_score += centroid[term_index]
            print("{}) {} (score: {:.4f})".\
                  format(i+1, terms[term_index].encode('utf-8'), centroid[term_index]))
        print("Average cluster score {:.4f}".format(total_score/5.0))
        cluster_avg5_scores.append((cluster_number, total_score/5.0))

    print("\nThe clusters by average score, descending")
    import operator
    cluster_avg5_scores.sort(key = operator.itemgetter(1), reverse=True)
    for cnum, avg in cluster_avg5_scores:
        print cnum, "\t", avg
    print('----------------------------------------------------------------')
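As a possible follow-up (not in the original notebook), the last fitted pipeline from the loop above can also assign a previously unseen haiku to a cluster; the sample text below is hypothetical.

# Illustration only: cluster assignment for a new, hypothetical haiku,
# using `pipe` as fitted in the final iteration of the loop above.
new_haiku = ["winter river bends\nquiet heron on one leg\nwaiting for the thaw"]
print("Assigned to cluster: {}".format(pipe.predict(new_haiku)[0]))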
In [ ]: